# coding: utf8
import json
import re
from pathlib import Path

from core.asr.asr_data_seg import ASRDataSeg
from .asr_data import ASRData


class AsrDataBuilder:
    """Factory helpers that build ``ASRData`` objects from subtitle sources.

    All entry points are static methods; the class only serves as a namespace.
    Timestamps parsed from SRT and ASS text are converted to integer
    milliseconds before being handed to ``ASRDataSeg``.
    """

    # File extensions that from_subtitle_file knows how to dispatch.
    _SUPPORTED_SUFFIXES = ('.srt', '.ass', '.json')

    # "HH:MM:SS,mmm --> HH:MM:SS,mmm" (a '.' is also accepted before the ms).
    _SRT_TIME_PATTERN = re.compile(
        r'(\d{2}):(\d{2}):(\d{1,2})[.,](\d{3})\s-->\s(\d{2}):(\d{2}):(\d{1,2})[.,](\d{3})'
    )

    # Captures start time, end time, style name and the text of a Dialogue line.
    _ASS_DIALOGUE_PATTERN = re.compile(
        r'Dialogue: \d+,(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2}),(.*?),.*?,\d+,\d+,\d+,.*?,(.*?)$')

    # Inline ASS override blocks such as "{\an8}".
    _ASS_OVERRIDE_PATTERN = re.compile(r'\{[^}]*}')

    @staticmethod
    def _to_milliseconds(hours: int, minutes: int, seconds: int, millis: int) -> int:
        """Combine clock components into a single millisecond count."""
        return hours * 3600000 + minutes * 60000 + seconds * 1000 + millis

    @staticmethod
    def _parse_ass_time(time_str: str) -> int:
        """Convert an ASS ``H:MM:SS.cc`` timestamp to milliseconds."""
        hours, minutes, seconds = time_str.split(':')
        seconds, centi_seconds = seconds.split('.')
        # ASS stores hundredths of a second, hence the factor of 10.
        return AsrDataBuilder._to_milliseconds(
            int(hours), int(minutes), int(seconds), int(centi_seconds) * 10
        )

    @staticmethod
    def from_subtitle_file(file_path: str) -> ASRData:
        """
        Load an ASRData instance from a subtitle file on disk.

        The file is decoded as UTF-8 (a leading BOM is tolerated); if that
        fails it is decoded again as GBK. The parser is chosen from the
        lower-cased file extension.

        Args:
            file_path: Path to the subtitle file. Supported extensions are
                .srt, .ass and .json.
        Returns:
            ASRData: The parsed ASRData instance.
        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The extension is not supported (also covers the
                ValueError subclasses raised for undecodable text or
                malformed JSON).
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"文件不存在: {path}")

        # Reject unknown formats before touching the content, so an arbitrary
        # binary file reports "unsupported format" instead of a decode error.
        suffix = path.suffix.lower()
        if suffix not in AsrDataBuilder._SUPPORTED_SUFFIXES:
            raise ValueError(f"不支持的文件格式: {suffix}")

        try:
            # utf-8-sig strips an optional BOM, which json.loads would
            # otherwise reject; BOM-less UTF-8 decodes identically.
            content = path.read_text(encoding='utf-8-sig')
        except UnicodeDecodeError:
            content = path.read_text(encoding='gbk')

        if suffix == '.srt':
            return AsrDataBuilder.from_srt(content)
        if suffix == '.ass':
            return AsrDataBuilder.from_ass(content)
        return AsrDataBuilder.from_json(json.loads(content))

    @staticmethod
    def from_json(json_data: dict) -> ASRData:
        """
        Build an ASRData instance from already-decoded JSON subtitle data.

        Entries are visited in ascending numeric order of their keys. Each
        entry must provide 'original_subtitle', 'start_time' and 'end_time';
        a non-empty 'translated_subtitle' is appended to the original text
        on a new line.

        Args:
            json_data: Mapping from integer-like keys to segment dicts.
                (Time units are passed through unchanged — presumably
                milliseconds like the other parsers; confirm with producer.)
        Returns:
            ASRData: The parsed ASRData instance.
        """
        segments = []
        for key in sorted(json_data, key=int):
            segment_data = json_data[key]
            text = segment_data['original_subtitle']
            # The translation is optional: tolerate a missing key as well as
            # an empty value.
            translated = segment_data.get('translated_subtitle')
            if translated:
                text += '\n' + translated
            segments.append(ASRDataSeg(
                text=text,
                start_time=segment_data['start_time'],
                end_time=segment_data['end_time']
            ))
        return ASRData(segments)

    @staticmethod
    def from_srt(srt_str: str) -> ASRData:
        """
        Build an ASRData instance from SRT-formatted text.

        Blocks are separated by blank lines. A block is used only if it has
        at least three lines and its second line is a valid SRT time range;
        everything from the third line on is the subtitle text.

        Args:
            srt_str: String containing SRT subtitles.
        Returns:
            ASRData: The parsed ASRData instance.
        """
        segments = []
        blocks = re.split(r'\n\s*\n', srt_str.strip())

        # Bilingual heuristic: no block has more than 4 lines AND more than
        # 90% of blocks have exactly 4 lines (index, time, original,
        # translation). re.split always yields at least one block, so the
        # division is safe.
        line_counts = [len(block.splitlines()) for block in blocks]
        four_line_ratio = sum(count == 4 for count in line_counts) / len(line_counts)
        has_translated_subtitle = max(line_counts) <= 4 and four_line_ratio > 0.9

        for block in blocks:
            lines = block.splitlines()
            if len(lines) < 3:
                continue

            match = AsrDataBuilder._SRT_TIME_PATTERN.match(lines[1])
            if not match:
                continue

            time_parts = [int(part) for part in match.groups()]
            start_time = AsrDataBuilder._to_milliseconds(*time_parts[:4])
            end_time = AsrDataBuilder._to_milliseconds(*time_parts[4:])

            if has_translated_subtitle:
                # Keep original and translation on separate lines.
                text = '\n'.join(lines[2:]).strip()
            else:
                # Monolingual: fold wrapped lines into a single line.
                text = ' '.join(lines[2:])

            segments.append(ASRDataSeg(text, start_time, end_time))

        return ASRData(segments)

    @staticmethod
    def from_ass(ass_str: str) -> ASRData:
        """
        Build an ASRData instance from ASS-formatted text.

        Only ``Dialogue:`` lines matching the expected layout are used.
        Override blocks (``{...}``) are removed, ``\\N`` becomes a newline and
        lines whose text ends up empty are dropped.

        When the text contains the marker "Script generated by
        VideoCaptioner", two dialogue lines sharing the same start and end
        time are merged into one segment; a line whose style is "Default"
        is placed first when it is the second of the pair. Lines left
        without a partner are emitted on their own after all other segments.

        Args:
            ass_str: String containing ASS subtitles.
        Returns:
            ASRData: The parsed ASRData instance.
        """
        segments = []

        # Files written by VideoCaptioner carry original + translation as
        # two Dialogue lines with identical timestamps.
        has_translation = "Script generated by VideoCaptioner" in ass_str

        # (start_time, end_time) -> text of a line still waiting for its pair.
        pending = {}

        for line in ass_str.splitlines():
            if not line.startswith('Dialogue:'):
                continue
            match = AsrDataBuilder._ASS_DIALOGUE_PATTERN.match(line)
            if not match:
                continue

            start_time = AsrDataBuilder._parse_ass_time(match.group(1))
            end_time = AsrDataBuilder._parse_ass_time(match.group(2))
            style = match.group(3).strip()

            text = AsrDataBuilder._ASS_OVERRIDE_PATTERN.sub('', match.group(4))
            text = text.replace('\\N', '\n').strip()
            if not text:
                continue

            if not has_translation:
                segments.append(ASRDataSeg(text, start_time, end_time))
                continue

            time_key = (start_time, end_time)
            if time_key not in pending:
                pending[time_key] = text
                continue

            # Second line of a pair: merge and emit.
            earlier_text = pending.pop(time_key)
            if style == "Default":
                merged = f"{text}\n{earlier_text}"
            else:
                merged = f"{earlier_text}\n{text}"
            segments.append(ASRDataSeg(merged, start_time, end_time))

        # NOTE(review): unpaired lines are appended after every paired
        # segment, so the list is not chronological unless ASRData sorts
        # its input — confirm.
        for (start_time, end_time), text in pending.items():
            segments.append(ASRDataSeg(text, start_time, end_time))

        return ASRData(segments)
